Source Code of org.terrier.indexing.hadoop.Hadoop_BasicSinglePassIndexer

/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is Hadoop_BasicSinglePassIndexer.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
*  Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
*  Richard McCreadie <richardm{a.}dcs.gla.ac.uk>
*/

package org.terrier.indexing.hadoop;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TaskAttemptID;

import org.terrier.compression.BitIn;
import org.terrier.compression.BitOutputStream;
import org.terrier.indexing.BasicSinglePassIndexer;
import org.terrier.indexing.Document;
import org.terrier.structures.BasicLexiconEntry;
import org.terrier.structures.DocumentIndexEntry;
import org.terrier.structures.FSOMapFileLexiconOutputStream;
import org.terrier.structures.FieldDocumentIndexEntry;
import org.terrier.structures.FieldLexiconEntry;
import org.terrier.structures.Index;
import org.terrier.structures.IndexUtil;
import org.terrier.structures.LexiconOutputStream;
import org.terrier.structures.SimpleDocumentIndexEntry;
import org.terrier.structures.indexing.CompressingMetaIndexBuilder;
import org.terrier.structures.indexing.DocumentIndexBuilder;
import org.terrier.structures.indexing.MetaIndexBuilder;
import org.terrier.structures.indexing.singlepass.FieldPostingInRun;
import org.terrier.structures.indexing.singlepass.RunsMerger;
import org.terrier.structures.indexing.singlepass.SimplePostingInRun;
import org.terrier.structures.indexing.singlepass.hadoop.HadoopRunIteratorFactory;
import org.terrier.structures.indexing.singlepass.hadoop.HadoopRunWriter;
import org.terrier.structures.indexing.singlepass.hadoop.HadoopRunsMerger;
import org.terrier.structures.indexing.singlepass.hadoop.IDComparator;
import org.terrier.structures.indexing.singlepass.hadoop.MapData;
import org.terrier.structures.indexing.singlepass.hadoop.MapEmittedPostingList;
import org.terrier.structures.indexing.singlepass.hadoop.SplitAwareWrapper;
import org.terrier.structures.indexing.singlepass.hadoop.SplitEmittedTerm;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.FieldScore;
import org.terrier.utility.Files;
import org.terrier.utility.io.HadoopPlugin;
import org.terrier.utility.io.HadoopUtility;
import org.terrier.utility.io.WrappedIOException;
import org.terrier.utility.io.HadoopPlugin.JobFactory;

/**
* Single Pass MapReduce indexer.
* <p><h3>Map phase processing</h3>
* Indexes as a Map task, taking in a series of documents, emitting posting lists for terms as
* memory becomes exhausted. Two side-files are created for each map task: the first (run files) takes note of how many documents were indexed
* for each flush and for each map; the second contains the statistics for each document in a miniature document index.
* </p>
* <p><h3>Reduce phase processing</h3>
* All posting lists for each term are read in, one term at a time. Using the run files, the posting lists are output into the final inverted
* file, with all document ids corrected. Lastly, when all terms have been processed, the document indexes are merged into the final document
* index, and the lexicon hash and lexid are created.
* </p>
* <p><h3>Partitioned Reduce processing</h3>
* Normally, the MapReduce indexer is used with a single reducer. However, if the partitioner is used, multiple reduces can run concurrently,
* building several final indices. In doing so, a large collection can be indexed into several output indices, which may be useful for distributed
* retrieval.
* </p>
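* <p><h3>Example job configuration</h3>
* A minimal, illustrative sketch (not the actual Terrier driver class) of how this indexer
* could be wired into a Hadoop job using the old <code>mapred</code> API; the number of
* reduce tasks and the use of the partitioner are assumptions for the example only.
* <pre>{@code
* JobConf conf = new JobConf();
* conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
* conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
* conf.setMapOutputKeyClass(SplitEmittedTerm.class);
* conf.setMapOutputValueClass(MapEmittedPostingList.class);
* conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class); // partitioner also referenced below in loadRunData()
* conf.setNumReduceTasks(4); // >1 builds several partitioned output indices
* }</pre>
* </p>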
* @author Richard McCreadie and Craig Macdonald
* @since 2.2
  */
@SuppressWarnings("deprecation")
public class Hadoop_BasicSinglePassIndexer
  extends BasicSinglePassIndexer
  implements Mapper<Text, SplitAwareWrapper<Document>, SplitEmittedTerm, MapEmittedPostingList>,
  Reducer<SplitEmittedTerm, MapEmittedPostingList, Object, Object>
{
  /**
   * Command-line entry point. With the arguments <tt>--finish numberOfReduceTasks</tt>,
   * builds the reverse metadata lookups for the index (or indices) produced by the indexing job.
   * @param args command line arguments
   * @throws Exception
   */
  public static void main(String[] args) throws Exception
    {
        if (args.length == 2 && args[0].equals("--finish"))
        {
            final JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
            if (jf == null)
                throw new Exception("Could not get JobFactory from HadoopPlugin");
            try{
              finish(ApplicationSetup.TERRIER_INDEX_PATH, Integer.parseInt(args[1]), jf);
            } catch (Exception e) {
              logger.error("Couldn't finish index", e);
            } finally {
              jf.close();
            }
        }
        else
        {
          System.err.println("Usage: Hadoop_BasicSinglePassIndexer [--finish numberOfReduceTasks]");
        }
    }
  /**
   * Finishes the index or indices at the given path, building the reverse metadata
   * lookup structures (one MapReduce job per reducer output index).
   * @param destinationIndexPath path of the index (or indices) to finish
   * @param numberOfReduceTasks number of reduce tasks used by the indexing job
   * @param jf JobFactory used to run the metadata reversal job(s)
   * @throws Exception
   */
  public static void finish(final String destinationIndexPath, int numberOfReduceTasks, final JobFactory jf) throws Exception
  {
    final String[] reverseMetaKeys = ApplicationSetup.getProperty("indexer.meta.reverse.keys", "docno").split("\\s*,\\s*");
    Index.setIndexLoadingProfileAsRetrieval(false);
    if (numberOfReduceTasks == 1)
    {     
      Index index = Index.createIndex(destinationIndexPath, ApplicationSetup.TERRIER_INDEX_PREFIX);
      if (index == null)
      {
        throw new IOException("No such index ["+destinationIndexPath+","+ApplicationSetup.TERRIER_INDEX_PREFIX+"]");
      }
      CompressingMetaIndexBuilder.reverseAsMapReduceJob(index, "meta", reverseMetaKeys, jf);
      index.close();
      return;
    }
    //make a list of MR jobs in separate threads
    List<Thread> threads = new ArrayList<Thread>(numberOfReduceTasks);
    for(int i=0;i<numberOfReduceTasks;i++)
    {
      final int id = i;
      threads.add(new Thread() {
          @Override
          public void run() {
            try{
              Index index = Index.createIndex(destinationIndexPath, ApplicationSetup.TERRIER_INDEX_PREFIX+"-"+id);
              CompressingMetaIndexBuilder.reverseAsMapReduceJob(index, "meta", reverseMetaKeys, jf);
              index.close();
            } catch (Exception e) {
              logger.error("Problem finishing meta", e);
              e.printStackTrace();
            }
          }       
        });     
    }
    //start the threads
    for(Thread t : threads)
      t.start();
    //wait for the threads to end
    for(Thread t : threads)
      t.join();
  }
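
  /* Illustrative programmatic use of finish(), mirroring main() above; the job factory
   * name and number of reduce tasks here are example values only:
   *
   *   JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
   *   try {
   *     Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH, 4, jf);
   *   } finally {
   *     jf.close();
   *   }
   */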
 
  static enum Counters {
    INDEXED_DOCUMENTS, INDEXED_EMPTY_DOCUMENTS, INDEXER_FLUSHES, INDEXED_TOKENS, INDEXED_POINTERS;
  };
 
  /** JobConf of the current running job */ 
  protected JobConf jc;

  /** The split that these documents came from **/
  protected int splitnum;
  protected boolean start;
 
  /**
   * Default constructor; initialises counters and the flush list.
   */
  public Hadoop_BasicSinglePassIndexer() {
    super(0,0,0);
    numberOfDocuments = currentId = numberOfDocsSinceCheck = numberOfDocsSinceFlush = numberOfUniqueTerms = 0;
    numberOfTokens = numberOfPointers = 0;
    flushNo=0;
    flushList = new LinkedList<Integer>();
  }
 
  /** Configure this indexer. Firstly, loads ApplicationSetup appropriately.
   * Actual configuration of indexer is then handled by configureMap() or configureReduce()
   * depending on whether a Map or Reduce task is being configured.
   * @param _jc The configuration for the job
   */
  public void configure(JobConf _jc)
  {
    this.jc = _jc;
   
    //1. configure application
    try{
      HadoopUtility.loadTerrierJob(_jc);
    } catch (Exception e) {
      throw new Error("Cannot load ApplicationSetup", e);
    }
   
    //2. configure indexer
    try{
      if (HadoopUtility.isMap(_jc))
      {
        configureMap();
      } else {
        configureReduce();
      }
    } catch (Exception e) {
      throw new Error("Cannot configure indexer", e);
    }
  }
 
  /** Called when the Map or Reduce task ends, to finish up the indexer. Actual cleanup is
   * handled by closeMap() or closeReduce() depending on whether this is a Map or Reduce task.
   */
  public void close() throws IOException
  {
    if (HadoopUtility.isMap(jc))
    {
      closeMap();
    } else {
      closeReduce();
    }
  }
 
  @Override
  /** Hadoop indexer does not have the consideration of boundary documents. */
  protected void load_builder_boundary_documents() { }
 

  /* ==============================================================
   * Map implementation from here down
   * ==============================================================
   */
 
  /** output collector for the current map indexing process */
  protected OutputCollector<SplitEmittedTerm, MapEmittedPostingList> outputPostingListCollector;
 
  /** Current map number */
  protected String mapTaskID;
  /** How many flushes have we made */
  protected int flushNo;

  /** OutputStream for the data on the runs (runNo, flushes etc.) */
  protected DataOutputStream RunData;
  /** List of how many documents are in each flush we have made */
  protected LinkedList<Integer> flushList;
 
  protected void configureMap() throws Exception
  {
    super.init();
    Path indexDestination = FileOutputFormat.getWorkOutputPath(jc);
    Files.mkdir(indexDestination.toString());
    mapTaskID = TaskAttemptID.forName(jc.get("mapred.task.id")).getTaskID().toString();
    currentIndex = Index.createNewIndex(indexDestination.toString(), mapTaskID);
    maxMemory = Long.parseLong(ApplicationSetup.getProperty("indexing.singlepass.max.postings.memory", "0"));
    //during reduce, we don't want to load indices into memory, as we only use
    //them as streams
    currentIndex.setIndexProperty("index.preloadIndices.disabled", "true");
    RunData = new DataOutputStream(
        Files.writeFileStream(
            new Path(indexDestination, mapTaskID+".runs").toString())
        );
    RunData.writeUTF(mapTaskID);
    start = true;
    createMemoryPostings();
    super.emptyDocIndexEntry = new SimpleDocumentIndexEntry();
    super.docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document");
    super.metaBuilder = createMetaIndexBuilder();
    emptyDocIndexEntry = (FieldScore.FIELDS_COUNT > 0) ? new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT) : new SimpleDocumentIndexEntry();
  }
 
 
 
  protected MetaIndexBuilder createMetaIndexBuilder()
  {
    final String[] forwardMetaKeys = ApplicationSetup.getProperty("indexer.meta.forward.keys", "docno").split("\\s*,\\s*");
    final int[] metaKeyLengths = parseInts(ApplicationSetup.getProperty("indexer.meta.forward.keylens", "20").split("\\s*,\\s*"));
    //no reverse metadata during main indexing, pick up as separate job later
    return new CompressingMetaIndexBuilder(currentIndex, forwardMetaKeys, metaKeyLengths, new String[0]);
  }
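
  /* Example property settings (values illustrative only) that drive the metadata builder
   * created above, and the reverse lookups later built by finish():
   *
   *   indexer.meta.forward.keys=docno,url
   *   indexer.meta.forward.keylens=20,256
   *   indexer.meta.reverse.keys=docno
   */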
 
  @edu.umd.cs.findbugs.annotations.SuppressWarnings(
      value="DM_GC",
      justification="Forcing GC is an essential part of releasing " +
          "memory for further indexing")
  /** causes the posting lists built up in memory to be flushed out */
  protected void forceFlush() throws IOException
  {
    //logger.info("Map "+mapTaskID+", flush requested, containing "+numberOfDocsSinceFlush+" documents, flush "+flushNo);
    if (mp == null)
      throw new IOException("Map flushed before any documents were indexed");
    mp.finish(new HadoopRunWriter(outputPostingListCollector, mapTaskID, splitnum, flushNo));
    RunData.writeInt(currentId);
    if (currentReporter != null)
      currentReporter.incrCounter(Counters.INDEXER_FLUSHES, 1);
    System.gc();
    createMemoryPostings();
    memoryCheck.reset();
    numberOfDocsSinceFlush = 0;
    currentId = 0;
    flushNo++;
  }
 
  /**
   * Map processes a single document. Stores the terms in the document along with the posting list
   * until memory is full or all documents in this map have been processed, then writes them to
   * the output collector.
   * @param key - Wrapper for Document Number
   * @param value - Wrapper for Document Object
   * @param _outputPostingListCollector Collector for emitting terms and postings lists
   * @throws IOException
   */
  public void map(
      Text key, SplitAwareWrapper<Document> value,
      OutputCollector<SplitEmittedTerm, MapEmittedPostingList> _outputPostingListCollector,
      Reporter reporter)
    throws IOException
  {
    final String docno = key.toString();
    currentReporter = reporter;
    reporter.setStatus("Currently indexing "+docno);
    final Document doc = value.getObject();
   
    if (start) {
      splitnum = value.getSplitIndex();
      System.out.println(splitnum);
      //RunData.writeInt(splitnum);
      start = false;
    }
   
    this.outputPostingListCollector = _outputPostingListCollector;
   
    /* setup for parsing */
    createDocumentPostings();
    String term;//term we're currently processing
    numOfTokensInDocument = 0;
    //numberOfDocuments++;
    //get each term in the document
    while (!doc.endOfDocument()) {
      reporter.progress();
      if ((term = doc.getNextTerm())!=null && !term.equals("")) {
        termFields = doc.getFields();
        /* pass term into TermPipeline (stop, stem etc) */
        pipeline_first.processTerm(term);

        /* the term pipeline will eventually add the term to this object. */
      }
      if (MAX_TOKENS_IN_DOCUMENT > 0 &&
          numOfTokensInDocument > MAX_TOKENS_IN_DOCUMENT)
        break;
    }
   
    //if we didn't index all tokens from document,
    //we need to get to the end of the document.
    while (!doc.endOfDocument()){
      doc.getNextTerm();
    }
    /* we now have all terms in the DocumentTree, so we save the document tree */
    if (termsInDocument.getDocumentLength() == 0)
    {  /* this document is empty, add the minimum to the document index */
      // Nothing in the ifile
      indexEmpty(doc.getAllProperties());
    }
    else
    { /* index this document */
      try{
        indexDocument(doc.getAllProperties(), termsInDocument);
        numberOfTokens += numOfTokensInDocument;
        reporter.incrCounter(Counters.INDEXED_TOKENS, numOfTokensInDocument);
        reporter.incrCounter(Counters.INDEXED_POINTERS, termsInDocument.getNumberOfPointers());
      } catch (IOException ioe) {
        throw ioe;       
      } catch (Exception e) {
        throw new WrappedIOException(e);
      }
    }
    termsInDocument.clear();
    reporter.incrCounter(Counters.INDEXED_DOCUMENTS, 1);
  }
 
  protected Reporter currentReporter;
 
  /**
   * Write the empty document to the inverted index
   */
  protected void indexEmpty(final Map<String,String> docProperties) throws IOException
  {
    /* add doc to documentindex, even though it's empty */
    if(IndexEmptyDocuments)
    { 
      //logger.warn("Adding empty document "+docProperties.get("docno"));
      docIndexBuilder.addEntryToBuffer(emptyDocIndexEntry);
      metaBuilder.writeDocumentEntry(docProperties);
      currentId++;
      numberOfDocuments++;
      currentReporter.incrCounter(Counters.INDEXED_EMPTY_DOCUMENTS, 1);
    }
  }
 
  /** Finish up the map processing. Forces a flush, then writes out the final run data */
  protected void closeMap() throws IOException
  {
    forceFlush();
    docIndexBuilder.finishedCollections();
    currentIndex.setIndexProperty("index.inverted.fields.count", ""+FieldScore.FIELDS_COUNT);
    if (FieldScore.FIELDS_COUNT > 0)
    {
      currentIndex.addIndexStructure("document-factory", FieldDocumentIndexEntry.Factory.class.getName(), "java.lang.String", "${index.inverted.fields.count}");
    }
    else
    {
      currentIndex.addIndexStructure("document-factory", SimpleDocumentIndexEntry.Factory.class.getName(), "", "");
    }
    metaBuilder.close();
    currentIndex.flush();
    currentIndex.close();
    RunData.writeInt(-1);
    RunData.writeInt(numberOfDocuments);
    RunData.writeInt(splitnum);
    RunData.close();
    //logger.info("Map "+mapTaskID+ " finishing, indexed "+numberOfDocuments+ " in "+(flushNo-1)+" flushes");
  }
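
  /* For reference, the .runs side-file produced by each map task (written in configureMap(),
   * forceFlush() and closeMap()) contains: the map task id (UTF), one int per flush giving the
   * number of documents covered by that flush, then -1 as a terminator, the total number of
   * documents indexed by this map, and finally the split number. */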

  /* ==============================================================
   * Reduce implementation from here down
   * ==============================================================
   */
 
  /** OutputStream for the Lexicon*/
  protected LexiconOutputStream<String> lexstream;
  /** runIterator factory being used to generate RunIterators */
  protected HadoopRunIteratorFactory runIteratorF = null;
  /** records whether the reduce() has been called for the first time */
  protected boolean reduceStarted = false;
 
  protected boolean mutipleIndices = true;
  protected int reduceId;
  protected String[] MapIndexPrefixes = null;
  protected Reporter lastReporter = null;
 
  protected void configureReduce() throws Exception
  {
    super.init();
    start = true;
    //load in the current index
    final Path indexDestination = FileOutputFormat.getWorkOutputPath(jc);
    Files.mkdir(path = indexDestination.toString());
    final String indexDestinationPrefix = jc.get("indexing.hadoop.prefix", "data");
    reduceId = TaskAttemptID.forName(jc.get("mapred.task.id")).getTaskID().getId();
    indexDestination.toString();
    mutipleIndices = jc.getBoolean("indexing.hadoop.multiple.indices", true);
    if (jc.getNumReduceTasks() > 1)
    {
      //gets the reduce number and suffixes this to the index prefix
      prefix = indexDestinationPrefix + "-"+reduceId;
    }
    else
    {
      prefix = indexDestinationPrefix;
    }
   
    currentIndex = Index.createNewIndex(path, prefix);
   
    super.merger = createtheRunMerger();
    reduceStarted = false;
  }
 
  protected LinkedList<MapData> loadRunData() throws IOException
  {
    // Load in Run Data
    ArrayList<String> mapTaskIDs = new ArrayList<String>();
    final LinkedList<MapData> runData = new LinkedList<MapData>();
    DataInputStream runDataIn;
 
    final String jobId = TaskAttemptID.forName(jc.get("mapred.task.id")).getJobID().toString().replaceAll("job", "task");
   
    final FileStatus[] files = FileSystem.get(jc).listStatus(
      FileOutputFormat.getOutputPath(jc),
      new org.apache.hadoop.fs.PathFilter()
      {
        public boolean accept(Path path)
        {         
          final String name = path.getName();
          //1. is this a run file
          if (!(name.startsWith(jobId) && name.endsWith(".runs")))
            return false;
          return true;
        }
      }
    );

    if (files == null || files.length == 0)
    {
      throw new IOException("No run status files found in "+FileOutputFormat.getOutputPath(jc));
    }
   
    final int thisPartition = TaskAttemptID.forName(jc.get("mapred.task.id")).getTaskID().getId();
    final SplitEmittedTerm.SETPartitioner partitionChecker = new SplitEmittedTerm.SETPartitioner();
    partitionChecker.configure(jc);
   
   
    MapData tempHRD;
    for (FileStatus file : files)
    {
      //logger.info("Run data file "+ file.getPath().toString()+" has length "+Files.length(file.getPath().toString()));
      runDataIn = new DataInputStream(Files.openFileStream(file.getPath().toString()));
      tempHRD = new MapData(runDataIn);
      //check to see if this file contained our split information
      if (mutipleIndices && partitionChecker.calculatePartition(tempHRD.getSplitnum(), jc.getNumReduceTasks()) != thisPartition)
        continue;
     
      mapTaskIDs.add(tempHRD.getMap());
      runData.add(tempHRD);
      runDataIn.close();
    }
    // Sort by splitnum
    Collections.sort(runData);
    Collections.sort(mapTaskIDs, new IDComparator(runData));
    // A list of the index shards
    MapIndexPrefixes = mapTaskIDs.toArray(new String[0]);
    return runData;
  }
 
  /**
   * Merges the postings for the current term, converting the document IDs in the
   * postings to be relative to one another using the run number, number of documents
   * covered in each run, the flush number for that run and the number of documents
   * flushed.
   * @param mapData - info about the runs(maps) and the flushes
   */
  public void startReduce(LinkedList<MapData> mapData) throws IOException
  {
    //logger.info("The number of Reduce Tasks being used : "+jc.getNumReduceTasks());
    ((HadoopRunsMerger)(super.merger)).beginMerge(mapData);
    this.currentIndex.setIndexProperty("max.term.length", ApplicationSetup.getProperty("max.term.length", ""+20));
    lexstream = new FSOMapFileLexiconOutputStream(this.currentIndex, "lexicon",
        (FieldScore.FIELDS_COUNT  > 0 ? FieldLexiconEntry.Factory.class : BasicLexiconEntry.Factory.class));
    // Tell the merger how many reducers to merge for
    ((HadoopRunsMerger) merger).setNumReducers(
        mutipleIndices ? jc.getNumReduceTasks() : 1);
  }
 
  /** Main reduce algorithm step. Called for every term in the merged index, together with accessors
   * to the posting list information that has been written.
   * This reduce has no output.
   * @param Term the term whose posting lists are being merged
   * @param postingIterator Iterator over the temporary posting lists we have for this term
   * @param output Unused output collector
   * @param reporter Used to report progress
   */
  public void reduce(
      SplitEmittedTerm Term,
      Iterator<MapEmittedPostingList> postingIterator,
      OutputCollector<Object, Object> output,
      Reporter reporter)
    throws IOException
  {
    //if (logger.isDebugEnabled()) logger.debug("Reduce for term "+Term.getText());
    reporter.setStatus("Reducer is merging term " + Term.getTerm());
    if (! reduceStarted)
    {
      final LinkedList<MapData> runData = loadRunData();
      startReduce(runData);
      reduceStarted = true;
    }
    String term = Term.getTerm().trim();
    if (term.length() == 0)
      return;
    runIteratorF.setRunPostingIterator(postingIterator);
    runIteratorF.setTerm(term);
    try{
      merger.mergeOne(lexstream);
    } catch (Exception e) {
      throw new WrappedIOException(e);
    }
    reporter.progress();
    this.lastReporter = reporter;
  }
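
  /* Note: the first call to reduce() lazily loads the per-map run data and opens the lexicon
   * and inverted file outputs via startReduce(); each subsequent call merges the posting lists
   * of a single term into those outputs. */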

  /** Merges the simple document indexes made by each map task, creating the final document index */
  @SuppressWarnings("unchecked")
  protected void mergeDocumentIndex(Index[] src) throws IOException
  {
    //logger.info("Merging document and meta indices");
    final DocumentIndexBuilder docidOutput = new DocumentIndexBuilder(currentIndex, "document");
    final MetaIndexBuilder metaBuilder = this.createMetaIndexBuilder();
    int i_index = 0;
    int docCount =-1;
    for (Index srcIndex: src)
    {
      final Iterator<DocumentIndexEntry> docidInput = (Iterator<DocumentIndexEntry>)srcIndex.getIndexStructureInputStream("document");
      final Iterator<String[]> metaInput1 = (Iterator<String[]>)srcIndex.getIndexStructureInputStream("meta");
      while (docidInput.hasNext())
      {
        docCount++;
        docidOutput.addEntryToBuffer(docidInput.next());
        metaBuilder.writeDocumentEntry(metaInput1.next());
        this.lastReporter.progress();
      }
      IndexUtil.close(docidInput);
      IndexUtil.close(metaInput1);
      i_index++;
    }
    metaBuilder.close();
    docidOutput.finishedCollections();
    if (FieldScore.FIELDS_COUNT > 0)
    {
      currentIndex.addIndexStructure("document-factory", FieldDocumentIndexEntry.Factory.class.getName(), "java.lang.String", "${index.inverted.fields.count}");
    }
    else
    {
      currentIndex.addIndexStructure("document-factory", SimpleDocumentIndexEntry.Factory.class.getName(), "", "");
    }
    //logger.info("Finished merging document indices from "+src.length+" map tasks: "+docCount +" documents found");
  }

  /** Finishes the reduce step, by closing the lexicon and inverted file output,
   * building the lexicon hash and index, and merging the document indices created
   * by the map tasks. The output index is then finalised. */
  protected void closeReduce() throws IOException {
   
    if (! reduceStarted)
    {
      //logger.warn("No terms were input, skipping reduce close");
      return;
    }
    //generate final index structures
    //1. any remaining lexicon terms
    merger.endMerge(lexstream);
    //2. the end of the inverted file
    merger.getBos().close();
    lexstream.close();
   
   
    //index updating is ONLY for
    currentIndex.addIndexStructure(
        "inverted",
        invertedIndexClass,
        "org.terrier.structures.Index,java.lang.String,org.terrier.structures.DocumentIndex,java.lang.Class",
        "index,structureName,document,"+
          (FieldScore.FIELDS_COUNT > 0
            ? fieldInvertedIndexPostingIteratorClass
            : basicInvertedIndexPostingIteratorClass ));
    currentIndex.addIndexStructureInputStream(
                "inverted",
                invertedIndexInputStreamClass,
                "org.terrier.structures.Index,java.lang.String,java.util.Iterator,java.lang.Class",
                "index,structureName,lexicon-entry-inputstream,"+
                  (FieldScore.FIELDS_COUNT > 0
            ? fieldInvertedIndexPostingIteratorClass
            : basicInvertedIndexPostingIteratorClass ));
    currentIndex.setIndexProperty("index.inverted.fields.count", ""+FieldScore.FIELDS_COUNT );
    currentIndex.setIndexProperty("index.inverted.fields.names", ArrayUtils.join(FieldScore.FIELD_NAMES, ","));
   
   
    //3. finalise the lexicon
    currentIndex.setIndexProperty("num.Terms",""+ lexstream.getNumberOfTermsWritten() );
    currentIndex.setIndexProperty("num.Tokens",""+lexstream.getNumberOfTokensWritten() );
    currentIndex.setIndexProperty("num.Pointers",""+lexstream.getNumberOfPointersWritten() );
    if (FieldScore.FIELDS_COUNT > 0)
      currentIndex.addIndexStructure("lexicon-valuefactory", FieldLexiconEntry.Factory.class.getName(), "java.lang.String", "${index.inverted.fields.count}");
    this.finishedInvertedIndexBuild();
     
   
    //the document indices are only merged if we are creating multiple indices
    //OR if this is the first reducer for a job creating a single index
    if (mutipleIndices || reduceId == 0)
    {
      //4. document index
      Index[] sourceIndices = new Index[MapIndexPrefixes.length];
      for (int i = 0; i < MapIndexPrefixes.length; i++)
      {
        sourceIndices[i] = Index.createIndex(FileOutputFormat.getOutputPath(jc).toString(), MapIndexPrefixes[i]);
        if (sourceIndices[i] == null)
          throw new IOException("Could not load index from ("
            +FileOutputFormat.getOutputPath(jc).toString()+","+ MapIndexPrefixes[i] +") because "
            +Index.getLastIndexLoadError());
      }
      this.mergeDocumentIndex(sourceIndices);

      //5. close the map phase indices
      for(Index i : sourceIndices)
      {
        i.close();
      }
    }
    currentIndex.flush();
  }

  /** Creates the RunsMerger and the RunIteratorFactory */
  protected RunsMerger createtheRunMerger() {
    //logger.info("creating run merged with fields="+useFieldInformation);
    runIteratorF =
      new HadoopRunIteratorFactory(null,
        (useFieldInformation
          ? FieldPostingInRun.class
          : SimplePostingInRun.class),
        super.numFields);
    HadoopRunsMerger tempRM = new HadoopRunsMerger(runIteratorF);
    try{
      tempRM.setBos(new BitOutputStream(
          currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR
          + currentIndex.getPrefix() + ".inverted" + BitIn.USUAL_EXTENSION));
    } catch (IOException ioe) {
      ioe.printStackTrace();
    }
    return (RunsMerger)tempRM;
  }

}